from langchain.text_splitter import CharacterTextSplitter
from tool import custom_print

"""
Split by character:
    Official docs: https://python.langchain.com/docs/modules/data_connection/document_transformers/text_splitters/character_text_splitter

    1. The default separator of CharacterTextSplitter is a blank line, i.e. "\\n\\n".
    2. The `separator` parameter of CharacterTextSplitter accepts only one separator to split on.
    
"""

# Read the entire sample document into memory first; it is the only input
# handed to the splitter below.
with open('test2.txt', encoding='utf-8') as f:
    state_of_the_union = f.read()

# Default splitting: no `separator` is passed, so the splitter falls back to
# its default one ("\n\n"). Lengths are measured with the built-in len().
default_text_splitter = CharacterTextSplitter(
    length_function=len,
    chunk_overlap=20,
    chunk_size=160,
)

# create_documents is given a list of texts; here it holds just the one
# document read above.
default_texts = default_text_splitter.create_documents([state_of_the_union])

# Hand the resulting documents to the project's printing helper.
custom_print.print_all(default_texts)
